Clustering

Optional exercise

Follow the tutorial from the clustering section of the book PH525x series - Biomedical Data Science.

Install the package tissuesGeneExpression by running in R:

Clustering gene expression data in healthy tissues

The aim of the exercise is to find out if the samples belonging to different tissues in different species cluster together by species or by tissue. To answer this:

From the three datasets, keep only tissues belonging to the following categories:

## Data cleaning and preparation: to cluster based on gene expression data, we first need to make sure that the rat, mouse and human datasets are cleaned and combined in the right format before clustering.
install.packages("biomaRt")  # Install biomaRt if not installed
## Installing package into '/home/alexiosgiannoulas/R/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
## Warning: package 'biomaRt' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tibble)
library(biomaRt)

tissues <-  c( "brain", "colon",  "duodenum",  "esophagus" , "heart",   "ileum",   "jejunum", "kidney",  "liver",   "pancreas",  "quadriceps" , "stomach", "thymus" )

# Read the rat, mouse and human tables
table_rat <- read.table("data/E-MTAB-6081/rat_tpm.txt", sep= "\t", header=TRUE)
# Move row names (gene IDs) to an ordinary column and introduce a progressive ID to check correctly for duplicate rows. Otherwise, all rows with identical measurements will be marked as duplicates even if they have different gene IDs. Do the same for the mouse table
table_rat <- table_rat %>% rownames_to_column(var = "Gene_ID")

table_mouse <- read.table("data/E-MTAB-6081/mouse_tpm.txt", sep= "\t", header=TRUE)
table_mouse <- table_mouse %>% rownames_to_column(var = "Gene_ID")

# Read the table for human samples
table_human <- read.delim("data/GTEx_Analysis_v10_RNASeQCv2.4.2_gene_median_tpm.gct/GTEx_Analysis_2022-06-06_v10_RNASeQCv2.4.2_gene_median_tpm.gct", skip = 2, header = TRUE)

# Remove the decimal part in an Ensembl Gene ID to ensure consistency across tables
table_human$Name <- sub("\\..*", "", table_human$Name)

# Check for duplicates in the three tables
any(duplicated(table_rat))      # returns FALSE
## [1] FALSE
any(duplicated(table_mouse))    # returns FALSE
## [1] FALSE
any(duplicated(table_human))    # returns TRUE
## [1] TRUE
# Create a new data frame with all duplicated rows in table_human
duplicates_human_all <- table_human[duplicated(table_human$Name) | duplicated(table_human$Name, fromLast = TRUE), ]
# When exploring a few examples, we notice that there is usually a row with entries different from 0 and a row with all entries equal to 0 for a duplicate gene

# Remove rows where all values (except the "Name" column) are 0 in table_human, to filter out biologically irrelevant genes and since genes with all zeros across tissues do not provide any meaningful information for clustering
table_human_filtered <- table_human %>%
  filter(rowSums(across(where(is.numeric))) > 0)  # Keep rows where at least one value is greater than 0
#Check for duplicates again
any(duplicated(table_human_filtered))    # returns FALSE
## [1] FALSE
# Prepare human data for joining later. First, rename "Name" column
colnames(table_human_filtered)[1] <- "gene"
# Delete gene column, use description as gene name. When multiple genes have the same description, keep the one with the maximum expression
human_filtered <- table_human_filtered %>%
  dplyr::select(-gene) %>%
  dplyr::rename(gene = Description) %>%
  group_by(gene) %>%
  summarise(across(everything(), max)) %>% 
  ungroup()

# Convert column names to lowercase
colnames(human_filtered) <- tolower(colnames(human_filtered))

# Convert gene column to lowercase
human_filtered$gene <- tolower(human_filtered$gene)
# Make sure there are no duplicates
any(duplicated(human_filtered$gene))   # this returns FALSE
## [1] FALSE
# Similarly, remove rows with all zeros in table_mouse and table_rat
table_mouse_filtered <- table_mouse %>%
  filter(rowSums(across(where(is.numeric))) > 0)
table_rat_filtered <- table_rat %>%
  filter(rowSums(across(where(is.numeric))) > 0)

Now that we prepared the data, we proceed to match the Gene IDs for mouse, rat and human using Biomart

## Preparation steps to join the three tables
# Connect to Ensembl Biomart (Mouse dataset)
mouse_mart <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")
# Obtain Ensembl Gene IDs and their corresponding external gene names
mouse_gene_mapping <- getBM(
  attributes = c("ensembl_gene_id", "external_gene_name"),
  mart = mouse_mart
)
#saveRDS(mouse_gene_mapping, file = "mouse_gene_mapping.rds")

# Remove empty names to avoid issues later on
mouse_gene_mapping <- mouse_gene_mapping[mouse_gene_mapping$external_gene_name != "", ]

# Merge table_mouse_filtered with the mouse_gene_mapping data
mouse_external_names <-left_join(table_mouse_filtered, mouse_gene_mapping, by = c("Gene_ID" = "ensembl_gene_id"))

# Rename the column external_gene_name to Gene
mouse_external_names <- mouse_external_names %>%
  rename(Gene = external_gene_name)
# Remove the original Gene_ID column
mouse_external_names <- mouse_external_names[, !names(mouse_external_names) %in% "Gene_ID"]
# If multiple rows exist for the same gene, merge them by keeping the highest expression values per column
mouse_external_names <- aggregate(. ~ Gene, data = mouse_external_names, FUN = max)

# Add an identifier for mouse genes that will be later used, transform to lowercase
mouse_external_names <- mouse_external_names %>%
  rename_with(~ gsub("X199_", "X199_mouse_", .x) %>% tolower())
# As the tip in the hands-on suggested, converting all column names to lower case
mouse_external_names$gene <- tolower(mouse_external_names$gene) 
# Check for duplicate rows
any(duplicated(mouse_external_names$gene))   # this returns FALSE
## [1] FALSE
## Next, we do the same steps for rat

# Connect to Ensembl Biomart to get the rat gene name mapping
rat_mart <- useMart("ensembl", dataset = "rnorvegicus_gene_ensembl")
rat_gene_mapping <- getBM(
  attributes = c("ensembl_gene_id", "external_gene_name"),
  mart = rat_mart
)
#saveRDS(rat_gene_mapping, file = "rat_gene_mapping.rds")
rat_gene_mapping <- rat_gene_mapping[rat_gene_mapping$external_gene_name != "", ]

# Merge data
rat_external_names <-left_join(table_rat_filtered, rat_gene_mapping, by = c("Gene_ID" = "ensembl_gene_id"))
rat_external_names <- rat_external_names %>%
  rename(Gene = external_gene_name)
rat_external_names <- rat_external_names[, !names(rat_external_names) %in% "Gene_ID"]

# If multiple rows exist for the same gene, merge them as done for mouse
rat_external_names <- aggregate(. ~ Gene, data = rat_external_names, FUN = max)   

# Add an identifier for mouse genes that will be later used
colnames(rat_external_names) <- gsub("X199_","X199_rat_", colnames(rat_external_names)) %>% tolower()
# As the tip in the hands-on suggested, converting all column names to lower case
rat_external_names$gene <- tolower(rat_external_names$gene) 

any(duplicated(rat_external_names$gene))   # this returns FALSE
## [1] FALSE

Next step is to join the three tables

# Join the tables
join_table <- human_filtered %>% 
  inner_join(mouse_external_names, by = "gene") %>% 
  inner_join(rat_external_names, by = "gene")

any(duplicated(join_table$gene))   # this returns FALSE
## [1] FALSE
saveRDS(join_table, file = "join_table_v1.rds")

join_table_2 <- join_table %>% 
  #dplyr::select(gene, contains(tissues)) %>%
  column_to_rownames("gene") %>%  
  t()

# Create a regex pattern
pattern <- paste(tissues, collapse = "|")

# Filter rows where row names contain any of the tissue names
join_table_3 <- join_table_2[grep(pattern, rownames(join_table_2), ignore.case = TRUE), ]

# Before further processing, make sure we have a data frame. Add a tissue column whose values are the row names
join_table_3 <- as.data.frame(join_table_3)
join_table_3$tissue <- rownames(join_table_3) 
rownames(join_table_3) <- NULL  # Remove row names to avoid redundancy
join_table_3 <- join_table_3[c("tissue", setdiff(names(join_table_3), c("tissue")))] # Make tissue to be the first column for better readability

# Add a tissue category column, from 'tissue'
join_table_3$tissue_category <- stringr::str_extract(join_table_3$tissue, pattern)

# Add the species column
join_table_3$species <- ifelse(grepl("rat", join_table_3$tissue, ignore.case = TRUE), "rat",
                               ifelse(grepl("mouse", join_table_3$tissue, ignore.case = TRUE), "mouse", "human"))

# Reorder columns for better readability
join_table_3 <- join_table_3[, c("tissue", "tissue_category", "species", setdiff(names(join_table_3), c("tissue", "tissue_category", "species")))]

#saveRDS(join_table_3, file = "join_table_for_clustering.rds")

Bear in mind that there is not an exact match between the tissues across the different datasets.
pro tip Do not manually copy from the column names. Convert all column names to lower case, and split them appropriately.

Cluster the tissues using gene expression data. Run k-means and hierarchical clustering. For each algorithm, determine the optimal number of clusters.

library(ggplot2)
library(dplyr)
library(ggfortify)
library(cluster)
library(caret)
## Loading required package: lattice
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:biomaRt':
## 
##     select
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(Rtsne)
library(pheatmap)
library(umap)
###To start from this part without running the previous steps run this command
#join_table_3 <- readRDS(file = "join_table_for_clustering.rds")

#Before we do either k-means or hierarchical clustering, we need to preprocess the data. We will start by removing genes with near-zero variance, low expression, and selecting the top 2000 variable genes by variance. We will then scale the data and perform the clustering.

#Make 2 annotation vectors for tissue and species
tissue <- join_table_3$tissue_category
species <- join_table_3$species
# Keep only gene expression columns (i.e. remove categorical columns)
gene_data <- join_table_3 %>%
  dplyr::select(-c(tissue, tissue_category, species))

# Convert to numeric matrix
gene_matrix <- as.matrix(gene_data)
# Apply log2 transformation to normalize gene expression values
gene_matrix <- log2(gene_matrix + 1)
dim(gene_matrix)  # 14641 genes
## [1]   118 14638
# Identify and remove genes with near-zero variance
variances <- apply(gene_matrix, 2, var)
near_zero_variance <- which(variances < 1e-2)  
#delete columns with near 0 variance
gene_matrix <- gene_matrix[, -near_zero_variance]
dim(gene_matrix)  # 14641 genes -> 14258 genes
## [1]   118 14258
#Remove low-expression genes based on mean expression threshold
gene_means <- colMeans(gene_matrix)
filtered_matrix <- gene_matrix[, gene_means > 1]
dim(filtered_matrix) # 14258 genes -> 10115 genes
## [1]   118 10115
# Calculate variance for each gene
gene_variances <- apply(filtered_matrix, 2, var)
# Select top 2000 variable genes 
sorted_variance <- gene_variances[order(-gene_variances)]
top_genes <- names(sorted_variance)[1:2000]
# Subset gene expression data for the top 2000 genes
gene_matrix <- filtered_matrix[, top_genes]
# Scale the data (mean = 0, variance = 1)
gene_matrix <- scale(gene_matrix)
rownames(gene_matrix) <- join_table_3$tissue
set.seed(100)  # Set seed for reproducibility

## K-MEANS

# We have 13 different tissue types. To give some flexibility on the clustering, including the possibility subclusters could exist within a tissue type, we will try k from 1 to 20
wss <- sapply(1:20, function(k) {
  kmeans(gene_matrix, centers = k, nstart = 50, iter.max = 100)$tot.withinss
})

plot(1:20, wss, type = "b", xlab = "Number of Clusters", ylab = "Within-Cluster Sum of Squares", main = "Elbow Method for choosing k")

#Use the Silhouette method to determine the optimal number of clusters for tissue clustering
sil <- sapply(2:20, function(k) {
  kmeans_result <- kmeans(gene_matrix, centers = k, nstart = 50, iter.max = 100)
  mean(silhouette(kmeans_result$cluster, dist(gene_matrix))[, 3])
})
plot(2:20, sil, type = "b", xlab = "Number of Clusters", ylab = "Average Silhouette Width")

# We can see that for k=2 (first point in plot) and k=3 there are high silhouette widths. On a first look this makes sense because if we compare the human transcriptome with the ones of the mouse and the rat, human would cluster seperately from the other 2 species which explains k=2 but since k=3 has also high silhouette width, this means that the mouse and rat transcriptomes are different enough to be clustered separately as well. 
#For tissue type clustering we get high scores for 17 and 19 clusters. We choose 17 
# Run k-means with 17 clusters
kmeans_result <- kmeans(gene_matrix, centers = 17, nstart = 25)

# Add cluster labels to the original data
join_table_3$cluster <- as.factor(kmeans_result$cluster)


# Set colors for tissue types
colorblind_colors <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", 
                       "#D55E00", "#CC79A7", "#999999", "#660099", "#009999", 
                       "#666600", "#FF33CC", "#33CCFF") 
tissue_levels <- unique(tissue)  # Get unique tissue types
tissue_colors <- setNames(colorblind_colors[1:length(tissue_levels)], tissue_levels)
#Set colors for species
species_colors <- c("rat" = "#0072B2", "mouse" = "#D55E00", "human" = "#009E73")
species_levels <- unique(species)
species_colors <- setNames(species_colors[1:length(species_levels)], species_levels)
# Visualize the distribution of tissue types in each cluster
tissue_clusters <- ggplot(join_table_3, aes(x = cluster, fill = tissue_category)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = tissue_colors) + 
  labs(title = "Tissue Distribution by Cluster (K-Means)", x = "Cluster", y = "Proportion") +
  theme_minimal() 
tissue_clusters

## HIERARCHICAL CLUSTERING
library(rafalib)
mypar()
# Compute the Euclidean distance between samples for hierarchical clustering
distance_matrix <- dist(gene_matrix)
# Perform hierarchical clustering using the calculated distance matrix
hc <- hclust(distance_matrix)
#Visualize hierarchical clustering results
myplclust(hc, 
          labels = tissue, 
          lab.col = tissue_colors[tissue],  
          cex = 0.5, 
          main = "Hierarchical Clustering of Tissues")
# Add a horizontal line at height = 45 to visually cut the tree into clusters
abline(h = 40, col = "red") 

# Assign hierarchical clusters based on tree cutting
hclusters <- cutree(hc, h=40)
# Create a table to check the distribution of tissues across hierarchical clusters
table(true=tissue, cluster=hclusters)
##             cluster
## true          1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
##   brain      13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  6
##   colon       0  2  3  0  0  0  0  0  0  0  0  3  3  0  0  0  0
##   duodenum    0  0  0  0  0  0  0  0  0  0  0  3  3  0  0  0  0
##   esophagus   0  2  0  1  0  0  0  0  0  0  0  0  0  0  6  0  0
##   heart       0  0  0  0  2  0  0  0  0  0  0  0  0  0  5  1  0
##   ileum       0  0  2  0  0  0  0  0  1  0  0  2  3  0  0  1  0
##   jejunum     0  0  0  0  0  0  0  0  0  0  0  3  3  0  0  0  0
##   kidney      0  0  0  2  0  0  0  0  0  0  0  0  0  6  0  0  0
##   liver       0  0  0  0  0  3  1  0  0  0  6  0  0  0  0  0  0
##   pancreas    0  0  0  0  0  0  0  4  0  6  0  0  0  0  0  0  0
##   quadriceps  0  0  0  0  0  0  0  0  0  0  0  0  0  0  6  0  0
##   stomach     0  1  0  3  0  0  0  0  0  0  0  6  0  0  0  0  0
##   thymus      0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  6  0
# Add cluster labels to the original dataset
join_table_3$cluster_hc <- as.factor(hclusters)
# Perform PCA for visualization
pca_result <- prcomp(gene_matrix)
# Plot PCA colored by hierarchical clusters
PCA_HC <- autoplot(pca_result, data = join_table_3, colour = "cluster_hc",
         label = FALSE, label.size = 3, main = "Tissue Clusters (Hierarchical Clustering)") +
  theme_minimal() + 
  theme(panel.background = element_rect(fill = "white", colour = "black"),
        plot.background = element_rect(fill = "white"),
        panel.grid.major = element_line(colour = "gray90"),  
        panel.grid.minor = element_blank())
PCA_HC

# Check how tissue types are distributed across the hierarchical clusters
table(join_table_3$tissue_category, join_table_3$cluster_hc)
##             
##               1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
##   brain      13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  6
##   colon       0  2  3  0  0  0  0  0  0  0  0  3  3  0  0  0  0
##   duodenum    0  0  0  0  0  0  0  0  0  0  0  3  3  0  0  0  0
##   esophagus   0  2  0  1  0  0  0  0  0  0  0  0  0  0  6  0  0
##   heart       0  0  0  0  2  0  0  0  0  0  0  0  0  0  5  1  0
##   ileum       0  0  2  0  0  0  0  0  1  0  0  2  3  0  0  1  0
##   jejunum     0  0  0  0  0  0  0  0  0  0  0  3  3  0  0  0  0
##   kidney      0  0  0  2  0  0  0  0  0  0  0  0  0  6  0  0  0
##   liver       0  0  0  0  0  3  1  0  0  0  6  0  0  0  0  0  0
##   pancreas    0  0  0  0  0  0  0  4  0  6  0  0  0  0  0  0  0
##   quadriceps  0  0  0  0  0  0  0  0  0  0  0  0  0  0  6  0  0
##   stomach     0  1  0  3  0  0  0  0  0  0  0  6  0  0  0  0  0
##   thymus      0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  6  0
# Plot the proportion of tissue types in each hierarchical cluster to understand the clustering result
tissue_clusters_HC <- ggplot(join_table_3, aes(x = cluster_hc, fill = tissue_category)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = tissue_colors) +  # Explicitly set colors
  labs(title = "Tissue Distribution by Cluster HC", x = "Cluster", y = "Proportion") +
  theme_minimal()
tissue_clusters_HC

Compare the clustering results using both methodologies, and with the tissues/species. Show the results of the final partitions as a table.

# Compare the clustering results using k-means and hierarchical clustering
grid.arrange(tissue_clusters, tissue_clusters_HC, ncol = 2)

# Compute contingency tables for cluster assignments
kmeans_partition <- table(join_table_3$tissue_category, join_table_3$cluster)
hc_partition <- table(join_table_3$tissue_category, join_table_3$cluster_hc)
kmeans_partition
##             
##               1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
##   brain       0  0  0  0  0  0  0  0  0  0  0  0  6 13  0  0  0
##   colon       0  0  3  0  2  0  0  0  0  0  6  0  0  0  0  0  0
##   duodenum    0  6  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   esophagus   0  0  0  0  2  0  1  6  0  0  0  0  0  0  0  0  0
##   heart       0  0  0  0  2  0  0  0  5  0  0  1  0  0  0  0  0
##   ileum       0  5  3  0  0  0  0  0  0  0  0  0  0  0  0  0  1
##   jejunum     0  6  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   kidney      0  0  0  0  0  0  2  0  0  0  0  0  0  0  6  0  0
##   liver       3  0  0  6  0  0  1  0  0  0  0  0  0  0  0  0  0
##   pancreas    0  0  0  0  0  0  0  0  0  4  0  0  0  0  0  6  0
##   quadriceps  0  0  0  0  0  0  0  0  6  0  0  0  0  0  0  0  0
##   stomach     0  0  0  0  1  6  3  0  0  0  0  0  0  0  0  0  0
##   thymus      0  0  0  0  0  0  0  0  0  0  0  3  0  0  0  0  3
hc_partition
##             
##               1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
##   brain      13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  6
##   colon       0  2  3  0  0  0  0  0  0  0  0  3  3  0  0  0  0
##   duodenum    0  0  0  0  0  0  0  0  0  0  0  3  3  0  0  0  0
##   esophagus   0  2  0  1  0  0  0  0  0  0  0  0  0  0  6  0  0
##   heart       0  0  0  0  2  0  0  0  0  0  0  0  0  0  5  1  0
##   ileum       0  0  2  0  0  0  0  0  1  0  0  2  3  0  0  1  0
##   jejunum     0  0  0  0  0  0  0  0  0  0  0  3  3  0  0  0  0
##   kidney      0  0  0  2  0  0  0  0  0  0  0  0  0  6  0  0  0
##   liver       0  0  0  0  0  3  1  0  0  0  6  0  0  0  0  0  0
##   pancreas    0  0  0  0  0  0  0  4  0  6  0  0  0  0  0  0  0
##   quadriceps  0  0  0  0  0  0  0  0  0  0  0  0  0  0  6  0  0
##   stomach     0  1  0  3  0  0  0  0  0  0  0  6  0  0  0  0  0
##   thymus      0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  6  0

Plot a heatmap of the 50 genes with top variance over all samples. Add the information about tissue groups and model (human, rat and mouse) as annotations in the heatmap*.

# Get the top 50 genes with the highest variance
top50_genes <- names(sorted_variance)[1:50]
# Subset the gene matrix for the top 50 genes
top50_matrix <- gene_matrix[, top50_genes]
top50_matrix <- t(top50_matrix)
#Create dataframe for annotation 
annotation_frame <- join_table_3[, c("tissue_category", "species")]
rownames(annotation_frame) <- join_table_3$tissue
#Generate heatmap with tissue and species annotations
color_palette <- colorRampPalette(c("#FF7F0E", "white", "#1F77B4"))(100)
breaks <- c(seq(min(top50_matrix), 0, length.out = 51), seq(0 + .Machine$double.eps, max(top50_matrix), length.out = 50))
pheatmap(top50_matrix, 
         annotation_col = annotation_frame[, c("tissue_category", "species")],
         annotation_colors = list(tissue_category = tissue_colors, species = species_colors),
         color = color_palette,
         main = "Top 50 Genes with Highest Variance",
         breaks = breaks)

Exercise 2: Dimensionality reduction

PCA

With the gene expression for different tissues and models, perform a PCA on the data and visualize the results (PC1 and PC2, and also, PC3 ). Label the points in the plot with their respective tissues/models.

#We already run PCA for HC so we use the PCA result from the previous exercise
#Perform dimensionality reduction using PCA, labeling by tissue type
PCA_tiss <- autoplot(pca_result, data = join_table_3, colour = "tissue_category",
         label = FALSE, label.size = 3, main = "PCA colored by tissue type") +
  theme_minimal() + 
  theme(panel.background = element_rect(fill = "white", colour = "black"),
        plot.background = element_rect(fill = "white"),
        panel.grid.major = element_line(colour = "gray90"),  
        panel.grid.minor = element_blank())
PCA_tiss

# Create a 3D scatter plot for PC1, PC2, and PC3
pca_data <- data.frame(pca_result$x)
pca_3d <- plot_ly(data = pca_data, 
                  x = ~PC1, 
                  y = ~PC2, 
                  z = ~PC3, 
                  color = join_table_3$tissue_category,  # Color points by tissue category
                  text = join_table_3$tissue_category,  # Show tissue/category in the hover label
                  type = "scatter3d", 
                  mode = "markers") %>%
  layout(title = "3D PCA colored by tissue type", 
         scene = list(xaxis = list(title = 'PC1'),
                      yaxis = list(title = 'PC2'),
                      zaxis = list(title = 'PC3')))
pca_3d
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
#Color by species
PCA_species <- autoplot(pca_result, data = join_table_3, colour = "species",
         label = FALSE, label.size = 3, main = "PCA colored by species") +
  theme_minimal() + 
  theme(panel.background = element_rect(fill = "white", colour = "black"),
        plot.background = element_rect(fill = "white"),
        panel.grid.major = element_line(colour = "gray90"),  
        panel.grid.minor = element_blank())
PCA_species

#Create a 3D scatter plot for PC1, PC2, and PC3 colored by species
pca_3d_species <- plot_ly(data = pca_data, 
                  x = ~PC1, 
                  y = ~PC2, 
                  z = ~PC3, 
                  color = join_table_3$species,  
                  text = join_table_3$species,  
                  type = "scatter3d", 
                  mode = "markers") %>%
  layout(title = "3D PCA colored by species", 
         scene = list(xaxis = list(title = 'PC1'),
                      yaxis = list(title = 'PC2'),
                      zaxis = list(title = 'PC3')))
pca_3d_species

Visualize the data using the PC1 and PC2 again, but this time, color the observations by cluster, using the k means clusters, with k of your choice. Produce a caption for the plot

# Plot clusters in PCA space
PCA_kmeans <- autoplot(pca_result, data = join_table_3, colour = "cluster",
         label = FALSE, label.size = 3, main = "PCA colored by k-means cluster") +
  theme_minimal() + 
  theme(panel.background = element_rect(fill = "white", colour = "black"),
        plot.background = element_rect(fill = "white"),
        panel.grid.major = element_line(colour = "gray90"),  
        panel.grid.minor = element_blank())
PCA_kmeans

#Show PCA results colored by tissue and species side by side
grid.arrange(PCA_species, PCA_tiss, ncol = 2)

What are the top 50 genes that contribute to the PC1? Are they the same genes that are more variable according to the exercise 1?

# Get the top 50 genes contributing to PC1
top_genes_pc1 <- pca_result$rotation[, 1] %>% sort(decreasing = TRUE) %>% names() %>% head(50)
#Get the top 50 genes with the highest variance
top_genes_variance <- names(sort(gene_variances, decreasing = TRUE))[1:50]
#How similar are the two lists?
sum(top_genes_pc1 %in% top_genes_variance)  # 50
## [1] 10
#Only 10 genes are in common between the two lists. This is expected since the top 50 genes contributing to PC1 are not necessarily the top 50 most variable genes,
#because PCA is focused on capturing global patterns in the data, and not just variability.

tSNE

Perform t-SNE on the dataset and visualize the results. Test at least 2 perplexity values.

#t-SNE on gene_matrix
set.seed(100)  # Set seed for reproducibility
tsne_result <- Rtsne(gene_matrix, perplexity = 30, check_duplicates = FALSE)
# Create a data frame for the t-SNE results
tsne_data <- data.frame(tsne_result$Y)
colnames(tsne_data) <- c("tSNE1", "tSNE2")
tsne_data$tissue <- join_table_3$tissue_category
tsne_data$species <- join_table_3$species
# Plot t-SNE results colored by tissue type white
tsne_tissue <- ggplot(tsne_data, aes(x = tSNE1, y = tSNE2, color = tissue)) +
  geom_point() +
  scale_color_manual(values = tissue_colors) +
  labs(title = "t-SNE colored by tissue type (p = 30)") +
  theme_minimal()
tsne_tissue

## Plot t-SNE results colored by species
tsne_species <- ggplot(tsne_data, aes(x = tSNE1, y = tSNE2, color = species)) +
  geom_point() +
  scale_color_manual(values = species_colors) +
  labs(title = "t-SNE colored by species (p = 30)") +
  theme_minimal()
tsne_species

# Plot the 2 together
grid.arrange(tsne_tissue, tsne_species, ncol = 2)

#t-SNE with perplexity 20
set.seed(100)  # Set seed for reproducibility
tsne_result_20 <- Rtsne(gene_matrix, perplexity = 20, check_duplicates = FALSE)
# Create a data frame for the t-SNE results
tsne_data_20 <- data.frame(tsne_result_20$Y)
colnames(tsne_data_20) <- c("tSNE1", "tSNE2")
tsne_data_20$tissue <- join_table_3$tissue_category
tsne_data_20$species <- join_table_3$species
# Plot t-SNE results colored by tissue type white
tsne_tissue_20 <- ggplot(tsne_data_20, aes(x = tSNE1, y = tSNE2, color = tissue)) +
  geom_point() +
  scale_color_manual(values = tissue_colors) +
  labs(title = "t-SNE colored by tissue type (p = 20)") +
  theme_minimal()
tsne_tissue_20

## Plot t-SNE results colored by species
tsne_species_20 <- ggplot(tsne_data_20, aes(x = tSNE1, y = tSNE2, color = species)) +
  geom_point() +
  scale_color_manual(values = species_colors) +
  labs(title = "t-SNE colored by species (p = 20)") +
  theme_minimal()
tsne_species_20

#Plot perplexity 20 and 30 together for all 4 plots
grid.arrange(tsne_tissue_20, tsne_tissue, tsne_species_20, tsne_species,   ncol = 2)

UMAP

#Apply UMAP on scaled matrix gene_matrix
umap_result <- umap(gene_matrix)
umap_data <- as.data.frame(umap_result$layout)
colnames(umap_data) <- c("UMAP1", "UMAP2")
umap_data$tissue <- join_table_3$tissue_category
umap_data$species <- join_table_3$species
umap_tissue <- ggplot(umap_data, aes(x = UMAP1, y = UMAP2, color = tissue)) +
  geom_point() +
  scale_color_manual(values = tissue_colors) +
  labs(title = "UMAP colored by tissue type") +
  theme_minimal()

umap_tissue

umap_species <- ggplot(umap_data, aes(x = UMAP1, y = UMAP2, color = species)) +
  geom_point() +
  scale_color_manual(values = species_colors) +
  labs(title = "UMAP colored by species") +
  theme_minimal()
umap_species

grid.arrange(umap_tissue, umap_species, ncol = 2)

# session info {.unnumbered}

R version 4.4.1 (2024-06-14) Platform: x86_64-pc-linux-gnu Running under: Ubuntu 24.04.1 LTS

Matrix products: default BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0

locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

time zone: Europe/Madrid tzcode source: system (glibc)

attached base packages: [1] stats graphics grDevices utils datasets methods base

other attached packages: [1] rafalib_1.0.0 umap_0.2.10.0 pheatmap_1.0.12 Rtsne_0.17
[5] plotly_4.10.4 gridExtra_2.3 caret_7.0-1 lattice_0.22-5
[9] cluster_2.1.6 ggfortify_0.4.17 ggplot2_3.5.1 biomaRt_2.60.1
[13] tibble_3.2.1 dplyr_1.1.4

loaded via a namespace (and not attached): [1] DBI_1.2.3 pROC_1.18.5 httr2_1.1.1
[4] rlang_1.1.5 magrittr_2.0.3 compiler_4.4.1
[7] RSQLite_2.3.9 reshape2_1.4.4 png_0.1-8
[10] vctrs_0.6.5 stringr_1.5.1 pkgconfig_2.0.3
[13] crayon_1.5.3 fastmap_1.2.0 dbplyr_2.5.0
[16] XVector_0.44.0 labeling_0.4.3 rmarkdown_2.29
[19] prodlim_2024.06.25 UCSC.utils_1.0.0 purrr_1.0.4
[22] bit_4.6.0 xfun_0.51 zlibbioc_1.50.0
[25] cachem_1.1.0 GenomeInfoDb_1.40.1 jsonlite_1.9.1
[28] progress_1.2.3 recipes_1.1.1 blob_1.2.4
[31] parallel_4.4.1 prettyunits_1.2.0 R6_2.6.1
[34] RColorBrewer_1.1-3 bslib_0.9.0 stringi_1.8.4
[37] reticulate_1.41.0.1 parallelly_1.42.0 rpart_4.1.23
[40] lubridate_1.9.4 jquerylib_0.1.4 Rcpp_1.0.14
[43] iterators_1.0.14 knitr_1.49 future.apply_1.11.3
[46] IRanges_2.38.1 timechange_0.3.0 Matrix_1.7-1
[49] splines_4.4.1 nnet_7.3-19 tidyselect_1.2.1
[52] rstudioapi_0.17.1 yaml_2.3.10 timeDate_4041.110
[55] codetools_0.2-20 curl_6.2.1 listenv_0.9.1
[58] plyr_1.8.9 Biobase_2.64.0 withr_3.0.2
[61] KEGGREST_1.44.1 askpass_1.2.1 evaluate_1.0.3
[64] future_1.34.0 survival_3.7-0 BiocFileCache_2.12.0
[67] xml2_1.3.7 Biostrings_2.72.1 pillar_1.10.1
[70] filelock_1.0.3 foreach_1.5.2 stats4_4.4.1
[73] generics_0.1.3 S4Vectors_0.42.1 hms_1.1.3
[76] munsell_0.5.1 scales_1.3.0 globals_0.16.3
[79] class_7.3-22 glue_1.8.0 lazyeval_0.2.2
[82] tools_4.4.1 data.table_1.17.0 RSpectra_0.16-2
[85] ModelMetrics_1.2.2.2 gower_1.0.2 grid_4.4.1
[88] tidyr_1.3.1 crosstalk_1.2.1 ipred_0.9-15
[91] AnnotationDbi_1.66.0 colorspace_2.1-1 nlme_3.1-165
[94] GenomeInfoDbData_1.2.12 cli_3.6.4 rappdirs_0.3.3
[97] viridisLite_0.4.2 lava_1.8.1 gtable_0.3.6
[100] sass_0.4.9 digest_0.6.37 BiocGenerics_0.50.0
[103] farver_2.1.2 htmlwidgets_1.6.4 memoise_2.0.1
[106] htmltools_0.5.8.1 lifecycle_1.0.4 hardhat_1.4.1
[109] httr_1.4.7 openssl_2.3.2 bit64_4.6.0-1
[112] MASS_7.3-61